scrapify 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -8,6 +8,12 @@ ScrApify is a library to build APIs by scraping static sites and use data as mod
8
8
  $ gem install scrapify
9
9
  ```
10
10
 
11
+ If you're using Bundler, add this to your Gemfile:
12
+
13
+ ```
14
+ gem 'scrapify'
15
+ ```
16
+
11
17
  ### Usage
12
18
 
13
19
  Define html url and declare attributes using xpath or css selectors.
@@ -63,6 +69,34 @@ This will respond to two urls index and show with JSON:
63
69
 
64
70
  Jsonify currently has a limitation where the URLs /pizzas.json and /pizzas/1.json cannot be matched by the same map entry in Rack routes
65
71
 
66
- ### JSON API (Rails application example)
72
+ ### JSON API (Rails example)
73
+
74
+ Scrapify comes with a Rack application called Jsonify, which can be used in Rails routes to expose scraped models as JSON.
75
+
76
+ Check out this [Rails example](https://github.com/sathish316/jsonify_rails_example) for more details:
77
+
78
+ https://github.com/sathish316/jsonify_rails_example
79
+
80
+ 1 Add scrapify to Gemfile
81
+
82
+ ```
83
+ gem 'scrapify'
84
+ ```
85
+
86
+ 2 Define a model to scrape data in app/models
87
+
88
+ ```
89
+ class Pizza
90
+ include Scrapify::Base
91
+ end
92
+ ```
93
+
94
+ 3 Add index and show API to routes
95
+
96
+ ```
97
+ pizza_api = Jsonify.new('/pizzas', Pizza)
98
+ get 'pizzas' => pizza_api
99
+ get 'pizzas/:id' => pizza_api
100
+ ```
67
101
 
68
- TODO
102
+ Jsonify scrapes the URL and exposes the index and show URLs as JSON APIs
data/lib/jsonify.rb CHANGED
@@ -7,7 +7,8 @@ class Jsonify
7
7
  def call(env)
8
8
  path = env['REQUEST_PATH']
9
9
  response = path == @route ? all : one(find_id(path))
10
- [200, {"Content-Type" => "application/json"}, [response]]
10
+ header = @model.http_cache_header.merge("Content-Type" => "application/json")
11
+ [200, header, [response]]
11
12
  end
12
13
 
13
14
  private
@@ -23,4 +24,4 @@ class Jsonify
23
24
  def find_id(path)
24
25
  path[path.rindex('/') + 1, path.size]
25
26
  end
26
- end
27
+ end
data/lib/scrapify/base.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  module Scrapify
2
2
  module Base
3
+ HTTP_CACHE_HEADERS_TO_RETURN = %w(Cache-Control Last-Modified Age ETag)
3
4
  def self.included(klass)
4
5
  klass.extend ClassMethods
5
6
  klass.cattr_accessor :url, :doc, :attribute_names
@@ -28,9 +29,19 @@ module Scrapify
28
29
  add_attribute(name)
29
30
  parser = options[:xpath] ? :xpath : :css
30
31
  selector = options[parser]
32
+ matcher = /#{options[:regex]}/ if options[:regex]
33
+ to_array = options[:array]
31
34
  meta_define "#{name}_values" do
32
35
  self.doc ||= parse_html
33
- self.doc.send(parser, selector).map &:content
36
+ self.doc.send(parser, selector).map do |element|
37
+ content = element.content
38
+ if matcher
39
+ match_data = content.scan(matcher).map &:first
40
+ options[:array] ? match_data : match_data.first
41
+ else
42
+ content.strip
43
+ end
44
+ end
34
45
  end
35
46
  end
36
47
 
@@ -39,6 +50,12 @@ module Scrapify
39
50
  define_count attribute
40
51
  end
41
52
 
53
+ def http_cache_header
54
+ http_header.select do |(k, v)|
55
+ HTTP_CACHE_HEADERS_TO_RETURN.map(&:upcase).include?(k.upcase)
56
+ end
57
+ end
58
+
42
59
  private
43
60
 
44
61
  def add_attribute(name)
@@ -47,7 +64,21 @@ module Scrapify
47
64
  end
48
65
 
49
66
  def parse_html
50
- Nokogiri::HTML(open(url))
67
+ Nokogiri::HTML(html_content)
68
+ end
69
+
70
+ def html_content
71
+ http_response.body
72
+ end
73
+
74
+ def http_response
75
+ @http_response ||= Net::HTTP.get_response URI(url)
76
+ end
77
+
78
+ def http_header
79
+ http_response.header.to_hash.each_with_object({}) do |(k,v), hash|
80
+ hash[k] = v.first
81
+ end
51
82
  end
52
83
 
53
84
  def define_finders
@@ -1,3 +1,3 @@
1
1
  module Scrapify
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
data/lib/scrapify.rb CHANGED
@@ -1,7 +1,8 @@
1
1
  require 'scrapify/version'
2
2
  require 'active_support/core_ext/class/attribute_accessors'
3
3
  require 'nokogiri'
4
- require 'open-uri'
4
+ require 'uri'
5
+ require 'net/http'
5
6
  require 'meta_define'
6
7
  require 'scrapify/base'
7
8
  require 'json'
data/spec/jsonify_spec.rb CHANGED
@@ -2,6 +2,11 @@ require 'spec_helper'
2
2
  require 'test_models'
3
3
 
4
4
  describe Jsonify do
5
+ before do
6
+ ::Pizza.stubs(:http_cache_header).returns("cache-control" => "private")
7
+ ::Pizza.stubs(:all).returns([])
8
+ end
9
+
5
10
  it "should find all objects and convert to json for index url" do
6
11
  pizzas = [{name: 'cheese'}, {name: 'chicken'}]
7
12
  ::Pizza.expects(:all).returns(pizzas)
@@ -25,4 +30,12 @@ describe Jsonify do
25
30
  header['Content-Type'].should == 'application/json'
26
31
  response.first.should == pizza.to_json
27
32
  end
28
- end
33
+
34
+ it "should forward the http cache headers" do
35
+ ::Pizza.expects(:http_cache_header).returns("cache-control"=>"private")
36
+
37
+ jsonify = Jsonify.new('/pizzas', ::Pizza)
38
+ status, header, response = jsonify.call('REQUEST_PATH' => '/pizzas')
39
+ header['cache-control'].should == "private"
40
+ end
41
+ end
data/spec/pizza.rb CHANGED
@@ -4,6 +4,8 @@ class Pizza
4
4
 
5
5
  attribute :name, css: ".menu_lft li a"
6
6
  attribute :image_url, xpath: "//li//input//@value"
7
+ attribute :price, css: ".price", regex: /([\d\.]+)/
8
+ attribute :ingredients, css: ".ingredients", regex: /contains (\w+)/, array: true
7
9
 
8
10
  key :name
9
11
  end
@@ -5,17 +5,43 @@ describe Scrapify do
5
5
 
6
6
  before do
7
7
  @pizza_url = "http://www.dominos.co.in/menuDetails_ajx.php?catgId=1"
8
- FakeWeb.register_uri :get, @pizza_url, :body => <<-HTML
8
+ FakeWeb.register_uri :get, @pizza_url,
9
+ :cache_control => "private, s-maxage=0, max-age=0, must-revalidate",
10
+ :age => 51592,
11
+ :length => 12312,
12
+ :body => <<-HTML
9
13
  <ul class="menu_lft">
10
- <li><a>chicken supreme</a><input value="chicken.jpg"></li>
11
- <li><a>veg supreme</a><input value="veg.jpg"></li>
12
- <li><a>pepperoni</a><input value="pepperoni.jpg"></li>
14
+ <li>
15
+ <a> chicken supreme </a><input value="chicken.jpg">
16
+ <span class='price'>(1.23)</span>
17
+ <span class='ingredients'>
18
+ <ol>
19
+ <li>contains corn</li>
20
+ <li>contains tomato</li>
21
+ <ol>
22
+ </span>
23
+ </li>
24
+ <li>
25
+ <a>veg supreme</a><input value="veg.jpg">
26
+ <span class='price'>(2.34)</span>
27
+ <span class='ingredients'>
28
+ <ol>
29
+ <li>contains mushroom</li>
30
+ <li>contains jalapeno</li>
31
+ <ol>
32
+ </span>
33
+ </li>
34
+ <li>
35
+ <a>pepperoni</a><input value="pepperoni.jpg">
36
+ <span class='price'>(3.45)</span>
37
+ <span class='ingredients'></span>
38
+ </li>
13
39
  </ul>
14
40
  HTML
15
41
  end
16
42
 
17
43
  it "should return attribute names" do
18
- ::Pizza.attribute_names.should == [:name, :image_url]
44
+ ::Pizza.attribute_names.should == [:name, :image_url, :price, :ingredients]
19
45
  end
20
46
 
21
47
  describe "html" do
@@ -30,6 +56,27 @@ describe Scrapify do
30
56
  it "should parse html and fetch attributes using xpath" do
31
57
  ::Pizza.image_url_values.should == ['chicken.jpg', 'veg.jpg', 'pepperoni.jpg']
32
58
  end
59
+
60
+ it "should parse html and extract attributes using regex" do
61
+ ::Pizza.price_values.should == ['1.23', '2.34', '3.45']
62
+ end
63
+
64
+ it "should parse html and extract multiple attributes using regex" do
65
+ ::Pizza.ingredients_values.should == [['corn','tomato'], ['mushroom','jalapeno'], []]
66
+ end
67
+
68
+ it "should strip content" do
69
+ ::Pizza.first.name.should == 'chicken supreme'
70
+ end
71
+
72
+ describe "cache headers" do
73
+ it "should return the http headers" do
74
+ ::Pizza.http_cache_header.should == {
75
+ "cache-control" => "private, s-maxage=0, max-age=0, must-revalidate",
76
+ "age" => 51592,
77
+ }
78
+ end
79
+ end
33
80
  end
34
81
 
35
82
  describe "find" do
@@ -80,14 +127,33 @@ describe Scrapify do
80
127
  describe "attributes" do
81
128
  it "should return attributes hash" do
82
129
  first_pizza = ::Pizza.first
83
- first_pizza.attributes.should == {name: "chicken supreme", image_url: "chicken.jpg"}
130
+ first_pizza.attributes.should == {
131
+ name: "chicken supreme",
132
+ image_url: "chicken.jpg",
133
+ price: '1.23',
134
+ ingredients: ['corn', 'tomato']
135
+ }
84
136
  end
85
137
  end
86
138
 
87
139
  describe "to_json" do
88
140
  it "should convert attributes to json" do
89
141
  first_pizza = ::Pizza.first
90
- first_pizza.to_json.should == {name: "chicken supreme", image_url: "chicken.jpg"}.to_json
142
+ first_pizza.to_json.should == {
143
+ name: "chicken supreme",
144
+ image_url: "chicken.jpg",
145
+ price: '1.23',
146
+ ingredients: ['corn', 'tomato']
147
+ }.to_json
148
+ end
149
+
150
+ it "should convert array to json" do
151
+ pizzas = ::Pizza.all
152
+ pizzas.to_json.should == [
153
+ {name: "chicken supreme", image_url: "chicken.jpg", price: '1.23', ingredients: ['corn', 'tomato']},
154
+ {name: "veg supreme", image_url: "veg.jpg", price: '2.34', ingredients: ['mushroom', 'jalapeno']},
155
+ {name: "pepperoni", image_url: "pepperoni.jpg", price: '3.45', ingredients: []},
156
+ ].to_json
91
157
  end
92
158
  end
93
- end
159
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapify
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-26 00:00:00.000000000Z
12
+ date: 2012-06-14 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70261744007960 !ruby/object:Gem::Requirement
16
+ requirement: &70352195064240 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70261744007960
24
+ version_requirements: *70352195064240
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: mocha
27
- requirement: &70261744006240 !ruby/object:Gem::Requirement
27
+ requirement: &70352195063700 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70261744006240
35
+ version_requirements: *70352195063700
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: fakeweb
38
- requirement: &70261743993080 !ruby/object:Gem::Requirement
38
+ requirement: &70352195062820 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70261743993080
46
+ version_requirements: *70352195062820
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &70261743990880 !ruby/object:Gem::Requirement
49
+ requirement: &70352195062120 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70261743990880
57
+ version_requirements: *70352195062120
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: activesupport
60
- requirement: &70261743989080 !ruby/object:Gem::Requirement
60
+ requirement: &70352195061500 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70261743989080
68
+ version_requirements: *70352195061500
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: json
71
- requirement: &70261743987980 !ruby/object:Gem::Requirement
71
+ requirement: &70352195060200 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,7 +76,7 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70261743987980
79
+ version_requirements: *70352195060200
80
80
  description: ScrApify scraps static html sites to RESTlike APIs
81
81
  email:
82
82
  - sathish316@gmail.com